********************************************************************************
* Create neighborhood characteristics

program define create_neighborhoods_characteristics

{

****
* Census data
	* Read in R from Census Bureau API

* 2000
import delimited census2000_final.csv, clear

* Replace NA with . and destring

des

foreach tempvar of varlist _all {

	capture noisily replace `tempvar' = "" if `tempvar' == "NA"
	destring `tempvar', replace

}

keep state county tract tot_* sh_* med_*

* Convert from 2000 dollars to 2017 dollars
	* Historical CPI, Table 24
	* https://www.bls.gov/cpi/tables/historical-cpi-u-201712.pdf

summ med_*
foreach tempvar of varlist med_* {
	replace `tempvar' = `tempvar' * (245.120 / 172.2)
}
summ med_*


**
* Create final tract variable

* State
gen statestring = string(state)
replace statestring = "0" + statestring if strlen(statestring) == 1
tab statestring, missing

* County
gen countystring = string(county)
replace countystring = "00" + countystring if strlen(countystring) == 1
replace countystring = "0" + countystring if strlen(countystring) == 2
tab countystring, missing

* Tract
gen tractstring = string(tract)
replace tractstring = "00000" + tractstring if strlen(tractstring) == 1
replace tractstring = "0000" + tractstring if strlen(tractstring) == 2
replace tractstring = "000" + tractstring if strlen(tractstring) == 3
replace tractstring = "00" + tractstring if strlen(tractstring) == 4
replace tractstring = "0" + tractstring if strlen(tractstring) == 5

gen tractfinal = statestring + countystring + tractstring
drop *string

rename tractfinal tractstring
order tractstring
gen check = strlen(tractstring)
tab check, missing
assert check == 11
drop check

drop state county tract

rename tractstring tract

gen year = 2000

* Keep only necessary variables
keep year tract tot_ind tot_hh sh_coll sh_minority sh_poverty med_hhinc med_rent med_value

* Check
summ tot_ind tot_hh sh_coll sh_minority sh_poverty med_hhinc med_rent med_value

foreach tempvar in sh_coll sh_minority sh_poverty {
	replace `tempvar' = `tempvar' * 100
}

* Reshape to wide
foreach tempvar of varlist tot_* sh_* med_* {
	rename `tempvar' `tempvar'_
}

reshape wide tot_* sh_* med_*, i(tract) j(year)

order tract
gsort tract

save temp_2000.dta, replace




**
* 2010
	* Read in R from Census Bureau API
import delimited acs2010_final.csv, clear

* Replace NA with . and destring

des

foreach tempvar of varlist _all {

	capture noisily replace `tempvar' = "" if `tempvar' == "NA"
	destring `tempvar', replace

}

keep state county tract tot_* sh_* med_*

* Convert from 2010 dollars to 2017 dollars
	* Historical CPI, Table 24
	* https://www.bls.gov/cpi/tables/historical-cpi-u-201712.pdf

summ med_*
foreach tempvar of varlist med_* {
	replace `tempvar' = `tempvar' * (245.120 / 218.056)
}
summ med_*


**
* Create final tract variable

* State
gen statestring = string(state)
replace statestring = "0" + statestring if strlen(statestring) == 1
tab statestring, missing

* County
gen countystring = string(county)
replace countystring = "00" + countystring if strlen(countystring) == 1
replace countystring = "0" + countystring if strlen(countystring) == 2
tab countystring, missing

* Tract
gen tractstring = string(tract)
replace tractstring = "00000" + tractstring if strlen(tractstring) == 1
replace tractstring = "0000" + tractstring if strlen(tractstring) == 2
replace tractstring = "000" + tractstring if strlen(tractstring) == 3
replace tractstring = "00" + tractstring if strlen(tractstring) == 4
replace tractstring = "0" + tractstring if strlen(tractstring) == 5

gen tractfinal = statestring + countystring + tractstring
drop *string

rename tractfinal tractstring
order tractstring
gen check = strlen(tractstring)
tab check, missing
assert check == 11
drop check

drop state county tract

rename tractstring tract

gen year = 2010

* Manual tract change in LA
	* # 2011: Some name changes in Madison County, NY. No boundary changes. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2011/geography-changes.html
	* # 2012: Some name changes in AZ and CA. No boundary changes. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2012/geography-changes.html
replace tract = "06037137000" if tract == "06037930401"

* Keep only necessary variables
keep year tract tot_ind tot_hh sh_coll sh_minority sh_poverty med_hhinc med_rent med_value

* Check
summ tot_ind tot_hh sh_coll sh_minority sh_poverty med_hhinc med_rent med_value

foreach tempvar in sh_coll sh_minority sh_poverty {
	replace `tempvar' = `tempvar' * 100
}

* Reshape to wide
foreach tempvar of varlist tot_* sh_* med_* {
	rename `tempvar' `tempvar'_
}

reshape wide tot_* sh_* med_*, i(tract) j(year)

order tract
gsort tract

save temp_2010.dta, replace




**
* 2014
	* Read in R from Census Bureau API

import delimited acs2014_final.csv, clear

* Replace NA with . and destring

des

foreach tempvar of varlist _all {

	capture noisily replace `tempvar' = "" if `tempvar' == "NA"
	destring `tempvar', replace

}

keep state county tract tot_* sh_* med_*

* Convert from 2014 dollars to 2017 dollars
	* Historical CPI, Table 24
	* https://www.bls.gov/cpi/tables/historical-cpi-u-201712.pdf

summ med_*
foreach tempvar of varlist med_* {
	replace `tempvar' = `tempvar' * (245.120 / 236.736)
}
summ med_*


**
* Create final tract variable

* State
gen statestring = string(state)
replace statestring = "0" + statestring if strlen(statestring) == 1
tab statestring, missing

* County
gen countystring = string(county)
replace countystring = "00" + countystring if strlen(countystring) == 1
replace countystring = "0" + countystring if strlen(countystring) == 2
tab countystring, missing

* Tract
gen tractstring = string(tract)
replace tractstring = "00000" + tractstring if strlen(tractstring) == 1
replace tractstring = "0000" + tractstring if strlen(tractstring) == 2
replace tractstring = "000" + tractstring if strlen(tractstring) == 3
replace tractstring = "00" + tractstring if strlen(tractstring) == 4
replace tractstring = "0" + tractstring if strlen(tractstring) == 5

gen tractfinal = statestring + countystring + tractstring
drop *string

rename tractfinal tractstring
order tractstring
gen check = strlen(tractstring)
tab check, missing
assert check == 11
drop check

drop state county tract

rename tractstring tract

gen year = 2014

* Manual tract change in LA
	* # 2011: Some name changes in Madison County, NY. No boundary changes. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2011/geography-changes.html
	* # 2012: Some name changes in AZ and CA. No boundary changes. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2012/geography-changes.html
replace tract = "06037137000" if tract == "06037930401"

* Keep only necessary variables
keep year tract tot_ind tot_hh sh_coll sh_minority sh_poverty sh_attend_collgrad med_hhinc med_rent med_value

* Check
summ tot_ind tot_hh sh_coll sh_minority sh_poverty sh_attend_collgrad med_hhinc med_rent med_value

foreach tempvar in sh_coll sh_minority sh_poverty {
	replace `tempvar' = `tempvar' * 100
}

* Reshape to wide
foreach tempvar of varlist tot_* sh_* med_* {
	rename `tempvar' `tempvar'_
}

reshape wide tot_* sh_* med_*, i(tract) j(year)

order tract
gsort tract

save temp_2014.dta, replace




**
* 2017
	* Read in R from Census Bureau API
import delimited acs2017_final.csv", clear

* Replace NA with . and destring

des

foreach tempvar of varlist _all {

	capture noisily replace `tempvar' = "" if `tempvar' == "NA"
	destring `tempvar', replace

}

keep state county tract tot_* sh_* med_*


**
* Create final tract variable

* State
gen statestring = string(state)
replace statestring = "0" + statestring if strlen(statestring) == 1
tab statestring, missing

* County
gen countystring = string(county)
replace countystring = "00" + countystring if strlen(countystring) == 1
replace countystring = "0" + countystring if strlen(countystring) == 2
tab countystring, missing

* Tract
gen tractstring = string(tract)
replace tractstring = "00000" + tractstring if strlen(tractstring) == 1
replace tractstring = "0000" + tractstring if strlen(tractstring) == 2
replace tractstring = "000" + tractstring if strlen(tractstring) == 3
replace tractstring = "00" + tractstring if strlen(tractstring) == 4
replace tractstring = "0" + tractstring if strlen(tractstring) == 5

gen tractfinal = statestring + countystring + tractstring
drop *string

rename tractfinal tractstring
order tractstring
gen check = strlen(tractstring)
tab check, missing
assert check == 11
drop check

drop state county tract

rename tractstring tract

gen year = 2017

* Keep only necessary variables
keep year tract tot_ind tot_hh sh_coll sh_minority sh_poverty med_hhinc med_rent med_value

* Check
summ tot_ind tot_hh sh_coll sh_minority sh_poverty med_hhinc med_rent med_value

foreach tempvar in sh_coll sh_minority sh_poverty {
	replace `tempvar' = `tempvar' * 100
}

* Reshape to wide
foreach tempvar of varlist tot_* sh_* med_* {
	rename `tempvar' `tempvar'_
}

reshape wide tot_* sh_* med_*, i(tract) j(year)

order tract
gsort tract

save temp_2017.dta, replace




**
* Crosswalk 2000 data using 2010 CBSA and central city boundaries from LTDB
use "$input_dir/crosswalk_2000_2010.dta, clear

isid trtid00 trtid10

rename trtid00 tract2000
rename trtid10 tract2010

summ weight
tab changetype, missing

egen check = sum(weight), by(tract2000)
summ check, d
drop check

rename weight share_2000_to_2010

keep tract2000 tract2010 share_2000_to_2010

* Merge with Census 2000
rename tract2000 tract

gsort tract
merge m:1 tract using temp_2000.dta
tab _merge
	* _merge == 2: In Census 2000 and not LTDB

summ tot_ind_2000 tot_hh_2000 if _merge == 2, d
	* All zeros
	* Ignore

keep if _merge == 3

count

assert ~missing(tot_ind_2000)

gen ind = tot_ind_2000 * share_2000_to_2010
summ ind, d

egen temp = sum(ind), by(tract2010)
gen share_2010_from_2000 = ind / temp
summ share_2010_from_2000, d
drop temp

* Save all
save temp_2000_all.dta, replace


**
* Convert totals and shares
	* share_2000_to_2010
	* Collapse

use temp_2000_all.dta, clear

summ tot_* sh_*
* foreach tempvar of varlist tot_* sh_* {
* 	replace `tempvar' = `tempvar' * share_2000_to_2010
* }

collapse (sum) tot_* (mean) sh_* [pweight=share_2000_to_2010], by(tract2010)
* collapse (sum) tot_* (mean) sh_*, by(tract2010)

summ tot_* sh_*

gsort tract2010
save temp_2000_totals.dta, replace


**
* Convert medians
use temp_2000_all.dta, clear

summ med_*
* foreach tempvar of varlist tot_* sh_* {
* 	replace `tempvar' = `tempvar' * share_2000_to_2010
* }

collapse (mean) med_* [pweight=share_2010_from_2000], by(tract2010)

summ med_*


**
* Merge with totals and shares
gsort tract2010
merge 1:1 tract2010 using temp_2000_totals.dta
tab _merge
	* Some missing medians
	* Fine
drop _merge

order med_*, last

rename tract2010 tract

gsort tract
save temp_2000_tract2010.dta, replace




**
* Add 2010 geo to data sets for all years
	* Just keep 2010 boundary definitions and merge
use "$input_dir/crosswalk_2000_2010.dta, clear

isid trtid00 trtid10

rename trtid10 tract2010

bysort tract2010: keep if _n == 1

drop trtid00

drop weight changetype
drop metdiv10

rename placefp10 place2010
rename cbsa10 cbsa2010
rename ccflag10 ccflag2010

rename tract2010 tract

order tract cbsa2010 ccflag2010 place2010

gsort tract
save temp_geo2010.dta, replace




**
* Merge 2010 with 2017 to diagnose non-matched tracts
	* Mostly name changes, described in R program
	* Then introduce manual changes before merge

	* From R program
	* # There actually are a small number of 2010 tract revisions. Details here:
		* # 2010: Decadal change from 2000. Fine. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2010/geography-changes.html
	* # 2011: Some name changes in Madison County, NY. No boundary changes. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2011/geography-changes.html
	* # 2012: Some name changes in AZ and CA. No boundary changes. https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2012/geography-changes.html
	* # 2013: None
	* # 2014: None
	* # 2015: None
	* # 2016: None
	* # 2017: None

use temp_2000_tract2010.dta, clear

* Add 2010
gsort tract
merge 1:1 tract using temp_2010.dta
tab _merge

rename _merge merge_2000_2010


* Add 2014
gsort tract
merge 1:1 tract using temp_2014.dta
tab _merge

list tract _merge if _merge ~= 3

* None of the remaining are in our sample
	* Ignore

rename _merge merge_20002010_2014

* Add 2017
gsort tract
merge 1:1 tract using temp_2017.dta
tab _merge

list tract _merge if _merge ~= 3

* None of the remaining are in our sample
	* Ignore

rename _merge merge_200020102014_2017

* Merge geo 2010
gsort tract
merge 1:1 tract using temp_geo2010.dta
tab _merge
	* Interesting
	* Quite a few obs in Census tract (Census API) data but NOT 2010 LTDB data

tab merge_2000_2010 _merge, missing
tab merge_20002010_2014 _merge, missing
tab merge_200020102014_2017 _merge, missing

rename _merge merge_census_ltdb

gen state = substr(tract, 1, 2)
	* Check which states and counties these are in
tab state if merge_census_ltdb == 1, missing
rename state state2010

rename tract tract2010

* Keep all for merge with buildings data
order tract2010 state2010 cbsa2010 ccflag2010 state2010 place2010 merge_*


**
* CBSA and tract sample restrictions

* Keep only our CBSAs
gen keepflag = 0
foreach tempvar in 12060 12420 16980 19740 31100 35620 37980 38900 41860 42660 47900 {
	replace keepflag = 1 if cbsa2010 == `tempvar'
}
tab cbsa2010 if keepflag == 1, missing
keep if keepflag == 1
drop keepflag

* Re- check merge problems after restricting to CBSAs we care about
tab1 merge_*, missing
	* Few problems

	* And don't have problems of missing tracts for these CBSAs

* Exclude if not in Census tract data
drop if merge_census_ltdb == 2


tab1 merge_*, missing


**
tab ccflag2010, missing


**
* Tract 2010 population >= 500
summ tot_ind_2010, d
count if tot_ind_2010 < 500
summ med_* if tot_ind_2010 < 500

****
* Create CBSA-level measures and tract changes before merge with buildings
bysort cbsa2010: gen unique_cbsa = (_n == 1)
tab unique_cbsa, missing

* CBSA total population and households
des tot_*
foreach tempvar of varlist tot_* {
	egen cbsa_`tempvar' = sum(`tempvar'), by(cbsa2010)
}
summ cbsa_tot_ind_*


**
* Create changes from
	* 2000 to 2010
	* 2010 to 2014
	* 2010 to 2017

	* Percent changes for medians
	* Differences for shares

foreach tempvar in med_hhinc med_rent med_value {
	gen `tempvar'_pctch2010 = 100 * (`tempvar'_2010 - `tempvar'_2000) / (0.5 * (`tempvar'_2010 + `tempvar'_2000))
	gen `tempvar'_pctch2014 = 100 * (`tempvar'_2014 - `tempvar'_2010) / (0.5 * (`tempvar'_2014 + `tempvar'_2010))
	gen `tempvar'_pctch2017 = 100 * (`tempvar'_2017 - `tempvar'_2010) / (0.5 * (`tempvar'_2017 + `tempvar'_2010))
}

foreach tempvar in sh_coll sh_poverty sh_minority {
	gen `tempvar'_diff2010 = (`tempvar'_2010 - `tempvar'_2000)
	gen `tempvar'_diff2014 = (`tempvar'_2014 - `tempvar'_2010)
	gen `tempvar'_diff2017 = (`tempvar'_2017 - `tempvar'_2010)
}

* All CBSA
summ *_pctch* *_diff*


**
* Tract deciles of 2000 and 2010 and 2014 values within CBSA distribution
	* Central city and suburbs

levelsof cbsa2010, local(all) clean
foreach tempvar in med_hhinc med_rent med_value sh_coll sh_poverty sh_minority {

	foreach temptype in 2000 2010 2014 {

		gen `tempvar'_`temptype'_qcbsa = .

		foreach tempcbsa in `all' {

			* Deciles
			xtile temp = `tempvar'_`temptype' if cbsa2010 == `tempcbsa', nquantiles(10)
			replace `tempvar'_`temptype'_qcbsa = temp if cbsa2010 == `tempcbsa' & ~missing(`tempvar'_`temptype')
			drop temp

		}

	}

}

tab1 med_hhinc_2000_qcbsa med_hhinc_2010_qcbsa med_hhinc_2014_qcbsa, missing


**
* Tract values within CBSA (adjusted for CBSA mean of medians)

isid tract2010
tab1 cbsa2010 ccflag2010, missing

foreach tempvar in sh_coll med_hhinc med_rent med_value sh_poverty sh_minority {

	foreach temptype in 2000 2010 2014 2017 {

		egen temp = mean(`tempvar'_`temptype'), by(cbsa2010)

		gen `tempvar'_`temptype'_adj = `tempvar'_`temptype' - temp

		drop temp

	}

}

summ *_2000_adj *_2010_adj *_2014_adj *_2017_adj




**
* Adjust changes as well
foreach tempvar in sh_coll_diff ///
	sh_poverty_diff ///
	sh_minority_diff ///
	med_hhinc_pctch ///
	med_rent_pctch ///
	med_value_pctch {

	foreach tempyear in 2010 2014 2017 {

		egen temp = mean(`tempvar'`tempyear'), by(cbsa2010)

		gen `tempvar'`tempyear'_adj = `tempvar'`tempyear' - temp

		drop temp

	}

}

summ *diff*_adj *pctch*_adj




****
* Flag college tracts


**
* 2010-2014 ACS
rename sh_attend_collgrad_2014 sh_student_2014


**
* 2006-2010 ACS
summ sh_coll_2010, d

* Merge with 2010 data from before
gsort tract2010
merge m:1 tract2010 using "$input_dir/temp_edu.dta
tab _merge
assert _merge ~= 1
keep if _merge == 3
drop _merge

gen sh_student_2010 = edu_enrolled / edu_total
summ sh_student_2010, d

foreach tempyear in 2010 2014 {

	label variable sh_student_`tempyear' "Share currently in college or graduate school"

	gen student_tract_`tempyear' = sh_student_`tempyear' >= 0.25 if ~missing(sh_student_`tempyear')
	label variable student_tract_`tempyear' "25% or more currently in college or graduate school"
	
	tab student_tract_`tempyear', missing
	tab tot_ind_`tempyear' if missing(sh_student_`tempyear')

}

gen diff = sh_student_2014 - sh_student_2010
summ diff, d
drop diff

drop edu_total edu_enrolled

order sh_student_2010 student_tract_2010 sh_student_2010 student_tract_2010, before(tot_ind_2000)

gsort tract2010
save "$results_dir/census_tract_cbsa_chars_final.dta, replace


}

end

********************************************************************************
* clean the new buildings

program define new_buildings

{

clear all
set more off

* start with geocoded buildings from RCA
use rca_geocoded_final, clear
rename obs_id id
gen base_id = id 
drop if final_lat==.
rename (final_lat final_lon rcamarkets_tx) (latitude longitude city)
keep if inlist(city_tx, "Atlanta", "Austin", "Brooklyn", "Chicago", "Denver")==1 | ///
	inlist(city_tx, "Los Angeles", "Philadelphia", "Portland", "Seattle","San Francisco", "Washington")==1
	
gen met=lower(city_tx)
replace met="la" if met=="los angeles"
replace met="sf" if met=="san francisco"
replace met="dc" if met=="washington"

gen month=month(Estimated)
gen check_year=year(Estimated)

replace units="" if units=="NULL"
replace sqft_nb="" if sqft_nb=="NULL"
replace land_area="" if land_area=="NULL"
destring units_dbl  sqft land_area, replace
drop estimated

rename (units_dbl sqft_nb land_area) (units sqft land_area)

* apply some filters
drop if units<25
drop if units==.
keep if features=="" | features == "subsidized" | features == "subsidized/townhouse" | features == "townhouse"

save rca_cleaned, replace

* make a version with fewer characteristics for memory intensive things
keep met latitude longitude id base_id
save short_apartments, replace

* add neighborhood characteristics
use rca_cleaned,clear

* get the correct state variables merged in
statastates, a(state) 
rename state_fips STATEFP
keep if _merge==3
drop _merge
tostring STATEFP, format(%02.0f) replace  

geoinpoly latitude longitude using AllStatesCoords.dta
merge m:1 _ID STATEFP using AllStatesChars.dta, gen(match_merge)
keep if match_merge==3

* merge in tract characteristics
gen tract_2010 = GEOID
merge m:1 tract_2010 using census_tract_cbsa_chars_final, keepus(total_pop rent_2br med_hh_inc inc_pc pct_black pct_white pct_college pct_owner pct_vacant pct_poverty pct_200k)
rename (rent_2br med_hh_inc inc_pc pct_black pct_white pct_college pct_owner pct_vacant pct_poverty pct_200k) ///
	(rent_2br_building med_hh_inc_building inc_pc_building pct_black_building pct_white_building pct_college_building pct_owner_building per_vac_building ///
	poverty_rate_building pct_200k_building)
rename _merge tract_char_merge
rename total_pop total_pop_building

drop if tract_char_merge==2

drop _ID COUNTYFP TRACTCE NAME NAMELSAD MTFCC FUNCSTAT INTPTLAT INTPTLON

gen building_id = id
rename longitude building_lon
rename latitude building_lat
rename postalcode_tx building_zip
rename yearblt_nb building_built
rename units building_units
rename sqft building_sqft
rename land_area building_land_area
rename features_tx building_features
rename month building_month
rename GEOID building_tract
rename ALAND building_tract_land_area
rename *_building building_*
rename met msa

drop final_source state state_name STATEFP AWATER
drop base_id city rcametros_tx propertyname_tx address_tx city_tx county_tx  

gen building_density=building_total_pop/building_tract_land_area

save rca_characteristics, replace

}

end

********************************************************************************
* count the number of other new buildings near each new building

program define count_nearby

{

* loop over distances and create separate counts for buildings completed at different times
local dist_list .25 .4 .5 .75 1

foreach dist in `dist_list'{

	local dist_label=`dist'*1000
	use short_apartments, clear
	geonear base_id latitude longitude using short_apartments, n(id latitude longitude) ignore within(`dist') long
	gen count_`dist_label'=1
	
	merge m:1 id using rca_cleaned, keepus(yearblt_nb)
	rename yearblt_nb built_nearby
	keep if _merge==3
	drop _merge
	
	merge m:1 base_id using rca_cleaned, keepus(yearblt_nb)
	rename yearblt_nb built_building
	keep if _merge==3
	drop _merge	
		
	gen count_`dist_label'_built=(built_nearby<=built_building)
	
	keep if km_to_id<`dist'
	collapse (sum) count_`dist_label'*, by (base_id)
	rename base_id id
	save apartment_distances_`dist_label', replace
	
}

* merge the different distance files together
use rca_cleaned,clear
keep id

merge 1:1 id using apartment_distances_250
drop _merge
replace count_250=0 if count_250==.
replace count_250_built=0 if count_250_built==.

merge 1:1 id using apartment_distances_400
drop _merge
replace count_400=0 if count_400==.
replace count_400_built=0 if count_400_built==.

merge 1:1 id using apartment_distances_500
drop _merge
replace count_500=0 if count_500==.
replace count_500_built=0 if count_500_built==.

merge 1:1 id using apartment_distances_750
drop _merge
replace count_750=0 if count_750==.
replace count_750_built=0 if count_750_built==.

merge 1:1 id using apartment_distances_1000
drop _merge
replace count_1000=0 if count_1000==.
replace count_1000_built=0 if count_1000_built==.

rename count_* b_nearby_*
save apartment_distances, replace

}

end


********************************************************************************
* combine new building and neighborhood characteristic files to make data sets
* for summary statistic tables (1, 2, A3)

program define building_neighborhood_char

{

clear
set more off
set varabbrev off


****
* Read final sample of buildings
use rca_characteristics, clear


**
gen collegegrad_tract = (share_enrolled_collegegrad >= 0.25) if ~missing(share_enrolled_collegegrad)
gen college_tract = (share_enrolled_college >= 0.25) if ~missing(share_enrolled_college)

tab collegegrad_tract college_tract, missing

**
* Flag our central cities
	* Identify using statefips - placefips of buildings
destring state2010, replace
summ state2010 place2010

bysort state2010 place2010: gen unique_stateplace = (_n == 1)
tab unique_stateplace, missing

gen stateplace = state2010 * 100000 + place2010
summ stateplace
bysort stateplace: gen check = (_n == 1)
tab check, missing

drop unique_stateplace check

tab building, missing
drop building

* Flag buildings 50+
capture noisily drop units50

summ units
gen units50 = (units >= 50) if ~missing(units)
tab units50, missing
tab units50

gen withbuilding = (~missing(obs_id))
tab withbuilding, missing

tab city_tx if withbuilding == 1, missing
tab stateplace if withbuilding == 1, missing
tab stateplace city_tx if withbuilding == 1, missing

drop state2010 place2010
replace stateplace = 1304000 if withbuilding == 1 & city_tx == "Atlanta"
replace stateplace = 4805000 if withbuilding == 1 & city_tx == "Austin"
replace stateplace = 3651000 if withbuilding == 1 & inlist(city_tx, "Bronx", "Brooklyn", "New York", "Queens")
replace stateplace = 1714000 if withbuilding == 1 & city_tx == "Chicago"
replace stateplace = 820000 if withbuilding == 1 & city_tx == "Denver"
replace stateplace = 644000 if withbuilding == 1 & city_tx == "Los Angeles"
replace stateplace = 4260000 if withbuilding == 1 & city_tx == "Philadelphia"
replace stateplace = 4159000 if withbuilding == 1 & city_tx == "Portland"
replace stateplace = 667000 if withbuilding == 1 & city_tx == "San Francisco"
replace stateplace = 5363000 if withbuilding == 1 & city_tx == "Seattle"
replace stateplace = 1150000 if withbuilding == 1 & city_tx == "Washington"

tab stateplace city_tx if withbuilding == 1, missing

egen maxwithbuilding = max(withbuilding), by(stateplace)
tab maxwithbuilding, missing
	* These are all tracts in cities / places with at least one RCA building
		* Including non-major places
tab stateplace if maxwithbuilding == 1, missing

gen maincc = (maxwithbuilding == 1)
tab maincc, missing

tab city maincc, missing

* Restrict maincc to only Brooklyn for New York CBSA

gen brooklyn = (substr(tract2010, 1, 5) == "36047")
tab city brooklyn, missing

* 1052 should also be in Brooklyn
list rcametros address_tx city_tx county_tx yearbuilt final_source final_lat final_lon rca_maincc_sample cbsa2010 ccflag2010 brooklyn maincc tract2010 if obs_id == 1052
	* Check where this lat / lon is
		* Greenpoint, basically in water
		* Looks correct
		* Adjust manually

replace brooklyn = 1 if obs_id == 1052

replace maincc = 0 if city == "newyork" & brooklyn == 0

tab city maincc, missing

drop maxwithbuilding

capture noisily drop unique_tract
bysort tract2010: gen unique_tract = (_n == 1)
tab unique_tract, missing
tab city if unique_tract == 1

**
* Label city values
tab city, missing
replace city = "brooklyn" if city == "newyork"

encode city, gen(citycode)
tab citycode, missing
tab citycode, missing nolabel
tab city citycode, missing nolabel

label define temp 1 "Atlanta" 2 "Austin" 3 "Brooklyn" 4 "Chicago" 5 "Denver" 6 "Los Angeles" 7 "Philadelphia" 8 "Portland" 9 "San Francisco" 10 "Seattle" 11 "Washington, DC"
label values citycode temp
tab citycode, missing
tab city citycode, missing

* Create building variables
tab yearbuilt, missing

gen builtany = withbuilding
gen built20152016 = (inlist(yearbuilt, 2015, 2016))
gen built20142015 = (inlist(yearbuilt, 2014, 2015))

tab isolated, missing
gen isolated_20152016 = (isolated == 1 & inlist(yearbuilt, 2015, 2016))
gen isolated_20142015 = (isolated == 1 & inlist(yearbuilt, 2014, 2015))

gen building_id = obs_id

gsort building_id
save temp_for_building_sample_merge.dta, replace

****
* Create data set for Tables 1 and 2

**
* Merge final building sample flags from analysis samples

use mig_nf250_buildings.dta, clear

append using mig_nn250_buildings.dta

append using rent_nf250_buildings.dta

append using rent_nn250_buildings

bysort building_id: keep if _n == 1

tab building_built, missing

rename building_built building_year

gen final_sample = 1

gsort building_id
save final_buildings.dta, replace


**
use temp_for_building_sample_merge.dta, clear

gen temp = 1
egen check = sum(temp), by(building_id)
tab check, missing
drop temp check

tab allsample, missing

gsort building_id
merge m:1 building_id using final_buildings.dta
tab _merge if ~missing(building_id) & allsample == 1

assert _merge ~= 2 if ~missing(building_id) & allsample == 1

rename _merge merge_`tempvar'

tab final_sample, missing

replace final_sample = 0 if missing(final_sample)

assert ~missing(final_sample)

* 1052 not in a main cc for some reason
list rcametros address_tx city_tx county_tx yearbuilt final_source final_lat final_lon rca_maincc_sample cbsa2010 ccflag2010 brooklyn maincc if obs_id == 1052

* Building-level table by city

keep if maincc == 1

tab city, missing

replace city = "brooklyn" if city == "newyork"

tab built20152016, missing

gen missid = (missing(building_id))

tab allsample missid, missing

replace allsample = 0 if missing(allsample)

tab allsample missid, missing




**
* Label variables

* Levels

foreach tempyear in 2000 2010 2017 {

	* College
	label variable sh_coll_`tempyear' "College `tempyear'"
	label variable sh_coll_`tempyear'_adj "College `tempyear'"
	if `tempyear' ~= 2017 label variable sh_coll_`tempyear'_qcbsa "College `tempyear'"

	* Income
	label variable med_hhinc_`tempyear' "Income `tempyear'"
	label variable med_hhinc_`tempyear'_adj "Income `tempyear'"
	if `tempyear' ~= 2017 	label variable med_hhinc_`tempyear'_qcbsa "Income `tempyear'"

	* Rent
	label variable med_rent_`tempyear' "Rent `tempyear'"
	label variable med_rent_`tempyear'_adj "Rent `tempyear'"
	if `tempyear' ~= 2017 	label variable med_rent_`tempyear'_qcbsa "Rent `tempyear'"

	* House value
	label variable med_value_`tempyear' "House `tempyear'"
	label variable med_value_`tempyear'_adj "House `tempyear'"
	if `tempyear' ~= 2017 	label variable med_value_`tempyear'_qcbsa "House `tempyear'"

}

* Changes
	* 2000 to 2010
	* 2010 to 2017

foreach lastyear in 2010 2017 {

	if `lastyear' == 2010 local firstyear 2000
	if `lastyear' == 2017 local firstyear 2010

	* College
	label variable sh_coll_diff`lastyear' "College `firstyear' to `lastyear'"
	label variable sh_coll_diff`lastyear'_adj "College `firstyear' to `lastyear'"

	* Income
	label variable med_hhinc_pctch`lastyear' "Income `firstyear' to `lastyear'"
	label variable med_hhinc_pctch`lastyear'_adj "Income `firstyear' to `lastyear'"

	* Rent
	label variable med_rent_pctch`lastyear' "Rent `firstyear' to `lastyear'"
	label variable med_rent_pctch`lastyear'_adj "Rent `firstyear' to `lastyear'"

	* House value
	label variable med_value_pctch`lastyear' "House `firstyear' to `lastyear'"
	label variable med_value_pctch`lastyear'_adj "House `firstyear' to `lastyear'"

}

des sh_coll* med_hhinc* med_rent* med_value*


**
* Define all cities
global allcities atlanta austin brooklyn chicago denver losangeles philadelphia portland sanfrancisco seattle washington


**
* Define neighborhood gentrification variables

	* Unadjusted and adjusted
		* Adjusted is by CBSA, before restricting to main CC only

	* College, unadjusted
	global coll_unadj sh_coll_2000 sh_coll_2010 sh_coll_2017 sh_coll_diff2010 sh_coll_diff2017

		* College, adjusted
		global coll_adj sh_coll_2000_adj sh_coll_2010_adj sh_coll_2017_adj sh_coll_diff2010_adj sh_coll_diff2017_adj

	* Income, unadjusted
	global inc_unadj med_hhinc_2000 med_hhinc_2010 med_hhinc_2017 med_hhinc_pctch2010 med_hhinc_pctch2017

		* Income, adjusted
		global inc_adj med_hhinc_2000_adj med_hhinc_2010_adj med_hhinc_2017_adj med_hhinc_pctch2010_adj med_hhinc_pctch2017_adj

	* Rent, unadjusted
	global rent_unadj med_rent_2000 med_rent_2010 med_rent_2017 med_rent_pctch2010 med_rent_pctch2017

		* Rent, adjusted
		global rent_adj med_rent_2000_adj med_rent_2010_adj med_rent_2017_adj med_rent_pctch2010_adj med_rent_pctch2017_adj

	* House value, unadjusted
	global house_unadj med_value_2000 med_value_2010 med_value_2017 med_value_pctch2010 med_value_pctch2017

		* House value, adjusted
		global house_adj med_value_2000_adj med_value_2010_adj med_value_2017_adj med_value_pctch2010_adj med_value_pctch2017_adj

foreach tempvar in coll inc rent house {

	display _newline
	display "Current: `tempvar'"
	summ ${`tempvar'_unadj} ${`tempvar'_adj}

}




****
* Rename other interesting building chars
rename land_area landarea

* Check for buildings need to add
count if obs_id == 1052
count if obs_id == 3251

gsort obs_id
save temp_table_all_almost.dta, replace




****
* Merge nearby mean rents from zillow data
use mean_building_rents.dta, clear

rename building_id obs_id

isid obs_id

gsort obs_id
merge 1:m obs_id using temp_table_all_almost.dta
tab _merge

assert _merge ~= 1

tab allsample _merge, missing

* Drop
tab tract2010 if allsample == 1 & _merge == 2
tab allsample _merge if tract2010 == "48453001908"

drop if allsample == 1 & _merge == 2

tab allsample _merge, missing

rename _merge merge_rent_data

summ rent_*

order rent_*, last

* Create p25, p50, and p75 versions for collapse
	* Always 600 distance
	* All and 1br
foreach tempvar in rent_600 rent_1br_600 units {
	foreach temptype in 25 50 75 {
		gen `tempvar'_p`temptype' = `tempvar'
	}
}

* Check others
summ units sqft landarea, d




****
* Final sample checks

gen lowinc = (med_hhinc_2017_adj < 0)
tab lowinc, missing


**
* Low-income tracts
tab final_sample lowinc, missing

tab city if final_sample == 1 & lowinc == 0, missing

tab student_tract if final_sample == 1 & lowinc == 0, missing

tab tract2010 if final_sample == 1 & lowinc == 0, missing

	tab lowinc if tract2010 == "41051003401", missing

	replace lowinc = 1 if tract2010 == "41051003401"

	tab final_sample lowinc, missing


**
* Student tracts
tab final_sample student_tract, missing

tab final_sample collegegrad_tract, missing

tab final_sample college_tract, missing

drop student_tract

rename college_tract student_tract

* Samples in each city
tab city final_sample, missing
tab city final_sample if lowinc == 1, missing
tab city final_sample if lowinc == 1 & student_tract == 0, missing

tab city final_sample if lowinc == 1 & student_tract == 0 & yearbuilt < 2018, missing
tab city final_sample if lowinc == 1 & student_tract == 0 & yearbuilt >= 2018, missing

* save file for tables 1 and 2
save data_for_tables_1_2.dta, replace


****
* Create data for Table A.3


**
* Prep ACS 2014 data
	* Read in R from Census API

import delimited using $data_dir/acs2014.csv, clear

* Some variables have negative values
	* Fix later

destring _all, replace ignore("NA")

des


**
* Variable descriptions from Census: 

	* List of variables from R code: 

* # All 2018                      # Group   # Variables

* # Individuals                   \ b01003_001e,
gen tot_ind = b01003_001e
summ tot_ind, d

* # Households                    \ b11001_001e,
gen tot_hh = b11001_001e
summ tot_hh, d

* # Individual education 25+      \ b15003_001e, b15003_022e, b15003_023e, b15003_024e, b15003_025e,
gen sh_coll = (b15003_022e + b15003_023e + b15003_024e + b15003_025e) / b15003_001e
summ sh_coll, d
assert inrange(sh_coll, 0, 1) if ~missing(sh_coll)

* # Individual race / ethnicity   \	b03002_001e, b03002_002e, b03002_003e, b03002_004e, b03002_005e, b03002_006e, b03002_007e, b03002_008e, b03002_009e, b03002_010e, b03002_011e, b03002_012e,
gen denom = b03002_001e

gen sh_latino = b03002_012e / denom

gen sh_white = b03002_003e / denom
gen sh_black = b03002_004e / denom
gen sh_aa = b03002_006e / denom
gen sh_other = (b03002_005e + b03002_007e + b03002_008e + b03002_009e) / denom

gen diff = 1 - sh_latino - sh_white - sh_black - sh_aa - sh_other
summ diff, d
assert abs(diff) < .00001 if ~missing(diff)
drop diff
drop denom


**
* Income and poverty variables

* # Individual poverty            \	b17001_001e, b17001_002e,
gen sh_poverty = b17001_002e / b17001_001e
summ sh_poverty
assert inrange(sh_poverty, 0, 1) if ~missing(sh_poverty)

* # Household median income       \	b19013_001e,
gen hh_medinc = b19013_001e
summ hh_medinc, d


**
* Housing variables: rent and own

* # Tenure                        \ b25003_001e, b25003_002e, b25003_003e,
gen sh_renter = b25003_003e / b25003_001e
summ sh_renter, d
assert inrange(sh_renter, 0, 1) if ~missing(sh_renter)


**
* Rent variables

* # Household median rent:        \ b25064_001e,
gen hh_medrent = b25064_001e
summ hh_medrent, d

* # Median rent As A Percentage Of Household Income In The Past 12 Months    
	* \ b25071_001e,
gen med_rent_pctinc = b25071_001e
summ med_rent_pctinc, d

* # Rent As A Percentage Of Household Income In The Past 12 Months
	* \ b25070_001e, b25070_002e, b25070_003e, b25070_004e, b25070_005e, b25070_006e, b25070_007e, b25070_008e, b25070_009e, b25070_010e, b25070_011e,

summ b25070_011e, d

gen sh_rent_pctinc_30 = (b25070_007e + b25070_008e + b25070_009e + b25070_010e) / (b25070_001e)
gen sh_rent_pctinc_50 = (b25070_010e) / (b25070_001e)
summ sh_rent_pctinc_*, d

assert inrange(sh_rent_pctinc_30, 0, 1) if ~missing(sh_rent_pctinc_30)
assert inrange(sh_rent_pctinc_50, 0, 1) if ~missing(sh_rent_pctinc_50)


**
* House value variables

* # Household median value:       \	b25077_001e,
gen hh_medvalue = b25077_001e
summ hh_medvalue, d

* # Mortgage                      \ b25081_001e, b25081_002e, b25081_003e, b25081_004e, b25081_005e, b25081_006e, b25081_007e, b25081_008e,
gen sh_mortgage = b25081_002e / b25081_001e
summ sh_mortgage, d
assert inrange(sh_mortgage, 0, 1) if ~missing(sh_mortgage)

* # Median Selected Monthly Owner Costs (Dollars) By Mortgage Status       \ , b25088_002e, b25088_003e,
gen med_ownercost = b25088_001e
summ med_ownercost, d

* # Median Selected Monthly Owner Costs As A Percentage Of Household Income In The Past 12 Months
		* \ , b25092_002e, b25092_003e,
gen med_ownercost_pctinc = b25092_001e
summ med_ownercost_pctinc, d

* # 	Mortgage Status By Selected Monthly Owner Costs As A Percentage Of Household Income In The Past 12 Months
	* \ b25091_001e, b25091_002e, b25091_003e, b25091_004e, b25091_005e, b25091_006e, b25091_007e, b25091_008e, b25091_009e, b25091_010e, b25091_011e, b25091_012e, b25091_013e, b25091_014e, b25091_015e, b25091_016e, b25091_017e, b25091_018e, b25091_019e, b25091_020e, b25091_021e, b25091_022e, b25091_023e,

summ b25091_012e b25091_023e, d

gen sh_ownercost_pctinc_30 = (b25091_008e + b25091_009e + b25091_010e + b25091_011e + b25091_019e + b25091_020e + b25091_021e + b25091_022e) / b25091_001e
gen sh_ownercost_pctinc_50 = (b25091_011e + b25091_022e) / b25091_001e
summ sh_ownercost_pctinc_*, d

assert inrange(sh_ownercost_pctinc_30, 0, 1) if ~missing(sh_ownercost_pctinc_30)
assert inrange(sh_ownercost_pctinc_50, 0, 1) if ~missing(sh_ownercost_pctinc_50)


**
* Mobility

* # Mobility (by tenure)          \ b07013_001e, b07013_002e, b07013_003e, b07013_004e, b07013_005e, b07013_006e,
gen sh_moved_all = 1 - (b07013_004e / b07013_001e)
gen sh_moved_renters = 1 - (b07013_006e / b07013_003e)
gen sh_moved_owners = 1 - (b07013_005e / b07013_002e)
summ sh_moved_*, d


**
* Vacancy
gen hunits = b25001_001e

gen occstatus = b25002_001e

assert hunits == occstatus

gen sh_vacant = b25002_003e / occstatus
summ sh_vacant, d
assert inrange(sh_vacant, 0, 1) if ~missing(sh_vacant)

drop hunits occstatus


**
* Household income details

gen denom = b19001_001e

gen sh_hhinc_35k = (b19001_002e + b19001_003e + b19001_004e + b19001_005e + b19001_006e + b19001_007e) / denom
gen sh_hhinc_35k_200k = (b19001_008e + b19001_009e + b19001_010e + b19001_011e + b19001_012e + b19001_013e + b19001_014e + b19001_015e + b19001_016e) / denom
gen sh_hhinc_200k = b19001_017e / denom

gen check = 1 - sh_hhinc_35k - sh_hhinc_35k_200k - sh_hhinc_200k
summ check, d
assert abs(round(check, 0.00001)) == 0 if ~missing(check)
drop denom check

summ sh_hhinc_35k sh_hhinc_200k, d





**
* Fix negative values
foreach tempvar of varlist hh_medinc hh_medrent med_rent_pctinc hh_medvalue med_ownercost* {

	display "`tempvar'"
	summ `tempvar'
	
	count if `tempvar' == -666666688
	capture noisily assert `tempvar' == -666666688 if `tempvar' < 0
	replace `tempvar' = . if `tempvar' < 0

	summ `tempvar'

}


**
* Keep necessary
keep state county tract tot_ind - sh_hhinc_200k


* Convert from 2014 to 2017 dollars
	* Historical CPI, Table 24
	* https://www.bls.gov/cpi/tables/supplemental-files/historical-cpi-u-201912.pdf
foreach tempvar in hh_medinc hh_medrent hh_medvalue med_ownercost {
	summ `tempvar'
	replace `tempvar' = `tempvar' * (245.120 / 236.736)
	summ `tempvar'
}


**
* Already 2010 tracts so don't need to crosswalk


**
* Save

gsort state county tract

capture noisily drop tract2010
gen double tract2010 = state * 10^9 + county * 10^6 + tract
format tract2010 %15.0f
order tract2010

keep tract2010 sh_latino sh_black sh_white sh_aa ///
	sh_renter sh_vacant sh_poverty sh_coll ///
	hh_medinc hh_medrent hh_medvalue tot_ind

gsort tract2010
save acs2014_final.dta, replace




****
* 2010 data (2006-2010 ACS)
use census_tract_cbsa_chars_final.dta, clear

keep tract2010 sh_coll_2010 med_hhinc_2010 med_rent_2010 med_value_2010 tot_ind_2010

destring tract2010, replace

gsort tract2010
merge 1:1 tract2010 using /acs2014_final.dta
tab _merge

keep if _merge == 3
drop _merge

gsort tract2010
save /temp_for_table_3.dta, replace




****
* Read building sample with tract identifiers


**
use temp_for_building_sample_merge.dta, clear

gen temp = 1
egen check = sum(temp), by(building_id)
tab check, missing
drop temp check

tab allsample, missing

gsort building_id
merge m:1 building_id using final_buildings.dta
tab _merge if ~missing(building_id) & allsample == 1

assert _merge ~= 2 if ~missing(building_id) & allsample == 1

rename _merge merge_`tempvar'

tab final_sample, missing

replace final_sample = 0 if missing(final_sample)

assert ~missing(final_sample)

tab final_sample

keep if final_sample == 1

keep tract2010 yearbuilt final_sample


****
* Merge in characteristics for Table 3
destring tract2010, replace

gsort tract2010
merge m:1 tract2010 using temp_for_table_3.dta, keep(1 3)
tab _merge

assert _merge == 3
drop _merge


**
* Create Y that are changes from 2010 to 2014

* 2014
rename hh_medinc med_hhinc
rename hh_medrent med_rent
rename hh_medvalue med_value

summ sh_latino sh_black sh_white sh_aa ///
	sh_renter sh_vacant sh_poverty

summ sh_coll med_hhinc med_rent med_value tot_ind

* 2010
replace sh_coll_2010 = sh_coll_2010 / 100

summ sh_coll_2010 med_hhinc_2010 med_rent_2010 med_value_2010 tot_ind_2010

* Change from 2010 to 2014
foreach tempvar in sh_coll med_hhinc med_rent med_value tot_ind {
	gen `tempvar'_ch = `tempvar' - `tempvar'_2010
}
summ *_ch


**
* Define treatment and control
tab yearbuilt, missing

gen treatment = (inlist(yearbuilt, 2014, 2015, 2016))
gen control = (inlist(yearbuilt, 2018, 2019))

tab treatment control, missing


**
* Save data set for Table A.3
save data_for_table_3.dta, replace


}

end